In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importing dataset
df = pd.read_csv("diabetes.csv")
print(df.head())
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
In [4]:
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None
In [18]:
print(df.describe)
<bound method NDFrame.describe of      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                       0.351   31        0  
2                       0.672   32        1  
3                       0.167   21        0  
4                       2.288   33        1  
..                        ...  ...      ...  
763                     0.171   63        0  
764                     0.340   27        0  
765                     0.245   30        0  
766                     0.349   47        1  
767                     0.315   23        0  

[768 rows x 9 columns]>
In [6]:
print(df.isnull().sum())
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
In [7]:
# Summary statistics for the dataset
summary_statistics = df.describe()
print(summary_statistics)
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  0.078000   21.000000    0.000000  
25%     27.300000                  0.243750   24.000000    0.000000  
50%     32.000000                  0.372500   29.000000    0.000000  
75%     36.600000                  0.626250   41.000000    1.000000  
max     67.100000                  2.420000   81.000000    1.000000  
In [10]:
# Measures of central tendency
mean_values = df.mean()
median_values = df.median()
mode_values = df.mode().iloc[0]

# Measures of dispersion
std_dev_values = df.std()
variance_values = df.var()
range_values = df.max() - df.min()

# Display these measures
print("Mean Values:\n", mean_values)
Mean Values:
 Pregnancies                   3.845052
Glucose                     120.894531
BloodPressure                69.105469
SkinThickness                20.536458
Insulin                      79.799479
BMI                          31.992578
DiabetesPedigreeFunction      0.471876
Age                          33.240885
Outcome                       0.348958
dtype: float64
In [11]:
print("Median Values:\n", median_values)
Median Values:
 Pregnancies                   3.0000
Glucose                     117.0000
BloodPressure                72.0000
SkinThickness                23.0000
Insulin                      30.5000
BMI                          32.0000
DiabetesPedigreeFunction      0.3725
Age                          29.0000
Outcome                       0.0000
dtype: float64
In [12]:
print("Mode Values:\n", mode_values)
Mode Values:
 Pregnancies                  1.000
Glucose                     99.000
BloodPressure               70.000
SkinThickness                0.000
Insulin                      0.000
BMI                         32.000
DiabetesPedigreeFunction     0.254
Age                         22.000
Outcome                      0.000
Name: 0, dtype: float64
In [13]:
print("Standard Deviation Values:\n", std_dev_values)
Standard Deviation Values:
 Pregnancies                   3.369578
Glucose                      31.972618
BloodPressure                19.355807
SkinThickness                15.952218
Insulin                     115.244002
BMI                           7.884160
DiabetesPedigreeFunction      0.331329
Age                          11.760232
Outcome                       0.476951
dtype: float64
In [14]:
print("Variance Values:\n", variance_values)
Variance Values:
 Pregnancies                    11.354056
Glucose                      1022.248314
BloodPressure                 374.647271
SkinThickness                 254.473245
Insulin                     13281.180078
BMI                            62.159984
DiabetesPedigreeFunction        0.109779
Age                           138.303046
Outcome                         0.227483
dtype: float64
In [15]:
print("Range Values:\n", range_values)
Range Values:
 Pregnancies                  17.000
Glucose                     199.000
BloodPressure               122.000
SkinThickness                99.000
Insulin                     846.000
BMI                          67.100
DiabetesPedigreeFunction      2.342
Age                          60.000
Outcome                       1.000
dtype: float64
In [16]:
# Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

#  outliers for each column
outliers = ((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR)))
print(outliers.sum())
Pregnancies                  4
Glucose                      5
BloodPressure               45
SkinThickness                1
Insulin                     34
BMI                         19
DiabetesPedigreeFunction    29
Age                          9
Outcome                      0
dtype: int64
In [17]:
# Exploring missing values
df.isnull().sum()
Out[17]:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
In [3]:
# Show top 5 rows
df.head(20)
Out[3]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
5 5 116 74 0 0 25.6 0.201 30 0
6 3 78 50 32 88 31.0 0.248 26 1
7 10 115 0 0 0 35.3 0.134 29 0
8 2 197 70 45 543 30.5 0.158 53 1
9 8 125 96 0 0 0.0 0.232 54 1
10 4 110 92 0 0 37.6 0.191 30 0
11 10 168 74 0 0 38.0 0.537 34 1
12 10 139 80 0 0 27.1 1.441 57 0
13 1 189 60 23 846 30.1 0.398 59 1
14 5 166 72 19 175 25.8 0.587 51 1
15 7 100 0 0 0 30.0 0.484 32 1
16 0 118 84 47 230 45.8 0.551 31 1
17 7 107 74 0 0 29.6 0.254 31 1
18 1 103 30 38 83 43.3 0.183 33 0
19 1 115 70 30 96 34.6 0.529 32 1
In [19]:
corr = df.corr()
sns.heatmap(corr, annot=True, square=True)
plt.show()
No description has been provided for this image
In [14]:
df.groupby('Outcome').mean()
Out[14]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age
Outcome
0 3.298000 109.980000 68.184000 19.664000 68.792000 30.304200 0.429734 31.190000
1 4.865672 141.257463 70.824627 22.164179 100.335821 35.142537 0.550500 37.067164
In [16]:
df.groupby('Outcome').mean().T.plot(figsize=(12,4))
Out[16]:
<Axes: >
No description has been provided for this image
In [18]:
sns.countplot(x='Outcome',data=df)
Out[18]:
<Axes: xlabel='Outcome', ylabel='count'>
No description has been provided for this image
In [29]:
# Box plots to identify outliers
plt.figure(figsize=(10, 5))
sns.boxplot(df)
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [30]:
# Define a function to remove outliers based on the IQR method
def remove_outliers(df, columns):
    for column in columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

# List of columns from which to remove outliers
columns_to_check = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Remove outliers
df_clean = remove_outliers(df, columns_to_check)

# Display the shape of the dataset before and after removing outliers
print("Original dataset shape:", df.shape)
print("Cleaned dataset shape:", df_clean.shape)
Original dataset shape: (768, 9)
Cleaned dataset shape: (636, 9)
In [32]:
# Box plots to identify outliers
plt.figure(figsize=(10, 5))
sns.boxplot(df_clean)
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [43]:
sns.pairplot(data=df_clean,kind='scatter')
plt.savefig('output.png') # Save that figure
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
No description has been provided for this image
In [44]:
sns.pairplot(data=df_clean,hue='Outcome')
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
Out[44]:
<seaborn.axisgrid.PairGrid at 0x25e3f5ecc90>
No description has been provided for this image
In [46]:
sns.scatterplot(x='Glucose',y='Insulin',data=df_clean)
plt.show()
No description has been provided for this image
In [19]:
diabetes = df[df['Outcome'] == 1]
no_diabetes = df[df['Outcome'] == 0]

# Display the first few rows of each group
diabetes.head(), no_diabetes.head()
Out[19]:
(   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
 0            6      148             72             35        0  33.6   
 2            8      183             64              0        0  23.3   
 4            0      137             40             35      168  43.1   
 6            3       78             50             32       88  31.0   
 8            2      197             70             45      543  30.5   
 
    DiabetesPedigreeFunction  Age  Outcome  
 0                     0.627   50        1  
 2                     0.672   32        1  
 4                     2.288   33        1  
 6                     0.248   26        1  
 8                     0.158   53        1  ,
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
 1             1       85             66             29        0  26.6   
 3             1       89             66             23       94  28.1   
 5             5      116             74              0        0  25.6   
 7            10      115              0              0        0  35.3   
 10            4      110             92              0        0  37.6   
 
     DiabetesPedigreeFunction  Age  Outcome  
 1                      0.351   31        0  
 3                      0.167   21        0  
 5                      0.201   30        0  
 7                      0.134   29        0  
 10                     0.191   30        0  )
In [21]:
import pandas as pd
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

# Extract glucose levels for both groups
glucose_with_diabetes = diabetes['Glucose']
glucose_without_diabetes = no_diabetes['Glucose']

# Perform an independent t-test
t_stat, p_value = stats.ttest_ind(glucose_with_diabetes, glucose_without_diabetes)

t_stat, p_value
Out[21]:
(14.600060005973894, 8.935431645289913e-43)
In [22]:
# Extract BMI for both groups
bmi_with_diabetes = diabetes['BMI']
bmi_without_diabetes = no_diabetes['BMI']

# Perform an independent t-test
t_stat_bmi, p_value_bmi = stats.ttest_ind(bmi_with_diabetes, bmi_without_diabetes)

t_stat_bmi, p_value_bmi
Out[22]:
(8.47183994786525, 1.2298074873116022e-16)
In [23]:
# Extract age for both groups
age_with_diabetes = diabetes['Age']
age_without_diabetes = no_diabetes['Age']

# Perform an independent t-test
t_stat_age, p_value_age = stats.ttest_ind(age_with_diabetes, age_without_diabetes)

t_stat_age, p_value_age
Out[23]:
(6.792688071649956, 2.2099754606654358e-11)
In [24]:
# Extract the number of pregnancies for both groups
preg_with_diabetes = diabetes['Pregnancies']
preg_without_diabetes = no_diabetes['Pregnancies']

# Perform an independent t-test
t_stat_preg, p_value_preg = stats.ttest_ind(preg_with_diabetes, preg_without_diabetes)

print(f'Number of Pregnancies - t-statistic: {t_stat_preg}, p-value: {p_value_preg}')
Number of Pregnancies - t-statistic: 6.298430550035151, p-value: 5.065127298053476e-10